Linear Regressions

I created several linear regression models using temperature as the dependent variable.

Temperature vs. Low Cloud Coverage

# Simple linear regression of temperature on low cloud coverage.
temp_lowc <- lm(temperature ~ cloudlow, data = combine)
temp_lowc %>% tidy() %>% as.data.frame()
##          term    estimate std.error statistic      p.value
## 1 (Intercept) 277.7132586 2.7138125 102.33325 5.691620e-78
## 2    cloudlow   0.6569496 0.1328456   4.94521 5.022951e-06
# Scatterplot with the fitted line. The intercept and slope are read from the
# model itself rather than hand-copied, so the line tracks `combine` if the
# data changes.
lowcg <- ggplot(combine, aes(x = cloudlow, y = temperature)) +
  geom_point() +
  xlab("Low Cloud Coverage") +
  ylab("Temperature") +
  geom_abline(
    intercept = coef(temp_lowc)[["(Intercept)"]],
    slope = coef(temp_lowc)[["cloudlow"]],
    col = "red"
  )
lowcg

Temperature vs. Middle Cloud Coverage

# Simple linear regression of temperature on middle cloud coverage.
temp_midc <- lm(temperature ~ cloudmid, data = combine)
temp_midc %>% tidy() %>% as.data.frame()
##          term   estimate std.error statistic      p.value
## 1 (Intercept) 313.376097 2.2739709 137.81008 5.642312e-87
## 2    cloudmid  -1.167782 0.1110144 -10.51919 4.670486e-16
# Scatterplot with the fitted line. The intercept and slope are read from the
# model itself rather than hand-copied, so the line tracks `combine` if the
# data changes.
midcg <- ggplot(combine, aes(x = cloudmid, y = temperature)) +
  geom_point() +
  xlab("Middle Cloud Coverage") +
  ylab("Temperature") +
  geom_abline(
    intercept = coef(temp_midc)[["(Intercept)"]],
    slope = coef(temp_midc)[["cloudmid"]],
    col = "red"
  )
midcg

Temperature vs. High Cloud Coverage

# Simple linear regression of temperature on high cloud coverage.
temp_highc <- lm(temperature ~ cloudhigh, data = combine)
temp_highc %>% tidy() %>% as.data.frame()
##          term    estimate std.error  statistic      p.value
## 1 (Intercept) 298.8933145 1.7554467 170.266243 2.193430e-93
## 2   cloudhigh  -0.8519088 0.1541152  -5.527739 5.221285e-07
# Scatterplot with the fitted line. The intercept and slope are read from the
# model itself rather than hand-copied, so the line tracks `combine` if the
# data changes.
highcg <- ggplot(combine, aes(x = cloudhigh, y = temperature)) +
  geom_point() +
  xlab("High Cloud Coverage") +
  ylab("Temperature") +
  geom_abline(
    intercept = coef(temp_highc)[["(Intercept)"]],
    slope = coef(temp_highc)[["cloudhigh"]],
    col = "red"
  )
highcg

Temperature vs. Ozone

# Simple linear regression of temperature on ozone.
temp_ozone <- lm(temperature ~ ozone, data = combine)
temp_ozone %>% tidy() %>% as.data.frame()
##          term    estimate   std.error statistic      p.value
## 1 (Intercept) 337.8292030 12.16953566 27.760238 1.634629e-39
## 2       ozone  -0.1525832  0.03904494 -3.907888 2.124625e-04
# Scatterplot with the fitted line. The intercept and slope are read from the
# model itself rather than hand-copied, so the line tracks `combine` if the
# data changes.
ozoneg <- ggplot(combine, aes(x = ozone, y = temperature)) +
  geom_point() +
  xlab("Ozone Level") +
  ylab("Temperature") +
  geom_abline(
    intercept = coef(temp_ozone)[["(Intercept)"]],
    slope = coef(temp_ozone)[["ozone"]],
    col = "red"
  )
ozoneg

Temperature vs. Surface Temperature

# Simple linear regression of temperature on surface temperature.
temp_surftemp <- lm(temperature ~ surftemp, data = combine)
temp_surftemp %>% tidy() %>% as.data.frame()
##          term   estimate  std.error statistic      p.value
## 1 (Intercept) 86.0301279 10.9520922  7.855132 3.376424e-11
## 2    surftemp  0.7002365  0.0374963 18.674818 6.705231e-29
# Scatterplot with the fitted line. The intercept and slope are read from the
# model itself rather than hand-copied, so the line tracks `combine` if the
# data changes.
surfg <- ggplot(combine, aes(x = surftemp, y = temperature)) +
  geom_point() +
  xlab("Surface Temperature") +
  ylab("Temperature") +
  geom_abline(
    intercept = coef(temp_surftemp)[["(Intercept)"]],
    slope = coef(temp_surftemp)[["surftemp"]],
    col = "red"
  )
surfg

Temperature vs. Pressure

# Simple linear regression of temperature on atmospheric pressure.
temp_pres <- lm(temperature ~ pressure, data = combine)
temp_pres %>% tidy() %>% as.data.frame()
##          term     estimate   std.error statistic      p.value
## 1 (Intercept) 259.84483680 20.06505232 12.950120 2.938691e-20
## 2    pressure   0.03408459  0.02234757  1.525203 1.317141e-01
# Scatterplot with the fitted line. The intercept and slope are read from the
# model itself rather than hand-copied, so the line tracks `combine` if the
# data changes.
presg <- ggplot(combine, aes(x = pressure, y = temperature)) +
  geom_point() +
  xlab("Atmospheric Pressure") +
  ylab("Temperature") +
  geom_abline(
    intercept = coef(temp_pres)[["(Intercept)"]],
    slope = coef(temp_pres)[["pressure"]],
    col = "red"
  )
presg



The six scatterplots above are combined into a single figure.

# Lay the six regression scatterplots out on a grid of 2 rows by 3 columns.
figure <- ggarrange(
  lowcg, midcg, highcg,
  ozoneg, surfg, presg,
  ncol = 3,
  nrow = 2
)
figure

Regression with Multiple Variables

From the simple linear regressions, pressure was the only predictor whose slope was not statistically significant (p ≈ 0.13), i.e. the only variable without a clear linear relationship to temperature. Therefore, the multiple linear regression model does not use pressure as a predictor.

# Multiple linear regression of temperature on the three cloud-coverage
# variables, ozone, and surface temperature (pressure is not included).
model <- lm(
  temperature ~ cloudlow + cloudmid + cloudhigh + ozone + surftemp,
  data = combine
)
summary(model)
## 
## Call:
## lm(formula = temperature ~ cloudlow + cloudmid + cloudhigh + 
##     ozone + surftemp, data = combine)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.7001 -1.7232 -0.0064  1.7982  4.8737 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 20.43522   22.89516   0.893    0.375    
## cloudlow    -0.65061    0.10505  -6.194 4.27e-08 ***
## cloudmid     0.16998    0.10875   1.563    0.123    
## cloudhigh   -0.43951    0.08269  -5.315 1.35e-06 ***
## ozone        0.01669    0.02003   0.833    0.408    
## surftemp     0.95383    0.06855  13.915  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.427 on 66 degrees of freedom
## Multiple R-squared:  0.9261, Adjusted R-squared:  0.9205 
## F-statistic: 165.3 on 5 and 66 DF,  p-value: < 2.2e-16

Predictions

I chose 50 random rows from the NASA data set (the first six are shown below):

# Draw 50 rows at random from the NASA data to use as prediction inputs.
# NOTE(review): no seed is set in this chunk, so unless one is set earlier
# the sampled rows (and everything downstream) will differ between runs.
temp_pred <- dfnasa %>%
  sample_n(size = 50)
head(temp_pred)
##         lat       long month year cloudhigh cloudlow cloudmid ozone
## 1 16.234783  -98.77391     7 1997      27.5     13.0     22.5   278
## 2 23.721739  -93.76522    11 1996       2.0     34.0      6.5   262
## 3 18.730435 -103.78261     5 1998       1.0     14.5      3.5   278
## 4 33.704348 -108.79130     1 1998       6.5      8.5     21.0   316
## 5  8.747826  -73.73043     9 2000      45.5      7.0     22.0   262
## 6 23.721739  -63.71304    11 1998       4.5     30.5      6.5   268
##   pressure surftemp temperature
## 1      945    298.3       301.0
## 2     1000    298.7       297.8
## 3      980    301.0       303.6
## 4      835    277.3       280.5
## 5      990    294.6       303.2
## 6     1000    299.2       299.2

Data frame of 50 random rows from the NASA data set.

# Predictor columns that are fed to the multiple regression model.
model_usage <- select(
  temp_pred,
  cloudhigh, cloudlow, cloudmid, ozone, surftemp
)
# Observed temperatures, kept aside for comparison with the predictions.
real_temp <- select(temp_pred, temperature)

head(model_usage)
##   cloudhigh cloudlow cloudmid ozone surftemp
## 1      27.5     13.0     22.5   278    298.3
## 2       2.0     34.0      6.5   262    298.7
## 3       1.0     14.5      3.5   278    301.0
## 4       6.5      8.5     21.0   316    277.3
## 5      45.5      7.0     22.0   262    294.6
## 6       4.5     30.5      6.5   268    299.2

The model_usage data frame holds the predictor values passed to the model, while real_temp stores the corresponding actual temperatures for comparison.

# Append a `pred` column holding the model's predicted temperature per row.
model_predictions <- add_predictions(model_usage, model)

head(model_predictions)
##   cloudhigh cloudlow cloudmid ozone surftemp     pred
## 1      27.5     13.0     22.5   278    298.3 292.8818
## 2       2.0     34.0      6.5   262    298.7 287.8213
## 3       1.0     14.5      3.5   278    301.0 302.8984
## 4       6.5      8.5     21.0   316    277.3 285.3877
## 5      45.5      7.0     22.0   262    294.6 284.9932
## 6       4.5     30.5      6.5   268    299.2 289.5767

Graph of Predictions

# Pair each observed temperature with its model prediction. The columns are
# named at construction time instead of being renamed afterwards.
actual_preddf <- data.frame(
  real = real_temp[["temperature"]],
  prediction = model_predictions$pred
)

# Interactive scatterplot of predicted vs. real temperature. The line with
# intercept 0 and slope 1 marks where prediction equals the real value.
ggplotly(
  ggplot(actual_preddf) +
    geom_point(aes(x = real, y = prediction)) +
    geom_abline(intercept = 0, slope = 1, col = "darkturquoise", size = 1) +
    xlab("Real Temperature") +
    ylab("Predicted Temperature")
)